Library

The R packages needed for this entire WGCNA plus BioNERO script.

# basic
library(data.table)
library(tidyverse)
library(dplyr)
library(SummarizedExperiment)
# WGCNA
library(WGCNA)
library(flashClust)
library(BioNERO)
allowWGCNAThreads()
## Allowing multi-threading with up to 8 threads.
# heatmap color
library(colorspace) # I use
library(circlize) # Pablo use
# GO term
library(biomaRt)

WGCNA pipeline

Data preparation (small sample size)
Gene count file is from normalized RUVseq result after regress out the top PC.

## data input
setwd("/Users/cl2375/OneDrive - Yale University/PhD study/Labs/Brennand Lab/RNA-seq/RNAseq_Pablo/WGCNA/Pablo_count")
gene_counts.df <- read.table(file = "normCounts_batch_corrected.txt", head = TRUE, sep = "\t", row.names = 1)
meta.df <- read.csv(file = "Pablo_meta.csv", head = TRUE, sep = ",", row.names = 1)
colnames(gene_counts.df) <- rownames(meta.df)

## data preprocessing
# replace missing value
exp_filt <- replace_na(gene_counts.df)
dim(exp_filt)
## [1] 16954    12
# remove non-expressed data
exp_filt <- remove_nonexp(exp_filt, method = "median", min_exp = 10)
dim(exp_filt)
## [1] 16077    12
# take top 10K variance genes
exp_filt <- filter_by_variance(exp_filt, n = 10000)
dim(exp_filt)
## [1] 10000    12
# filter outlier - could remove 1 outlier sample, but decide to not here since sample size is smaller
#exp_filt <- ZKfiltering(exp_filt, cor_method = "spearman")
#dim(exp_filt)
# adjust for PC
exp_filt <- PC_correction(exp_filt)
dim(exp_filt)
## [1] 10000    12
# set the final data used for following experiment
final_exp <- exp_filt

## explore data
p1 <- plot_heatmap(exp_filt, type = "samplecor", show_rownames = FALSE)
p1

p2 <- plot_PCA(exp_filt, metadata = meta.df[,2:3])
p2

p3 <- plot_heatmap(
    exp_filt[1:50, ], type = "expr", show_rownames = FALSE, show_colnames = TRUE
)
p3

set power

Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
9 0.7060 -0.4660 0.893 193.0 161.0 496
10 0.7600 -0.5150 0.892 169.0 137.0 452
11 0.8100 -0.5590 0.916 150.0 119.0 414
12 0.8360 -0.5980 0.907 135.0 104.0 382
13 0.8670 -0.6290 0.913 121.0 91.8 353

BioNERO WGCNA

# bioNERO data preprocessing
final_exp <- exp_preprocess(
  gene_counts.df, min_exp = 10, cor_method = 'spearman', variance_filter = TRUE, n = 10000)
## Number of removed samples: 1
# network module
#net <- exp2gcn(final_exp, net_type = "signed hybrid", SFTpower = 11, cor_method = "spearman", verbose = TRUE)
#saveRDS(net, file = "net_bioNERO_10kgene.rds")
net <- readRDS("net_bioNERO_10kgene.rds")
# Dendro and colors
plot_dendro_and_colors(net)

# Eigengene networks
plot_eigengene_network(net)

# number of genes per module
plot_ngenes_per_module(net)

## accessing module stability, cannot run with spearman
#module_stability(exp_filt, net, nRuns = 5)
## module-trait correlation
# traits
datTraits <- meta.df
datTraits$donor <- ifelse(datTraits$donor=="SCZ", 1, 0)
datTraits$treatment <- ifelse(datTraits$treatment=="TNF", 1, 0)
datTraits$group_HCP_NS <- ifelse(datTraits$group=="HCP_NS", 1, 0)
datTraits$group_HCP_TNF <- ifelse(datTraits$group=="HCP_TNF", 1, 0)
datTraits$group_SCZ_NS <- ifelse(datTraits$group=="SCZ_NS", 1, 0)
datTraits$group_SCZ_TNF <-ifelse(datTraits$group=="SCZ_TNF", 1, 0)
# set up se for easier run
nGenes = nrow(final_exp)
nSamples = 11
se <- SummarizedExperiment(assays = final_exp, colData = datTraits[-9,c(2:3,6:9)])
# calculate the p-value associated with the correlation
module.trait.correlation = cor(net$MEs, datTraits[-9,c(2:3,6:9)], use = "p", method = "spearman")
module.trait.Pvalue = corPvalueStudent(module.trait.correlation, nSamples)
# will display correlations and their p-values
textMatrix = paste(signif(module.trait.correlation, 2), "\n(",
                   signif(module.trait.Pvalue, 1), ")", sep = "");
dim(textMatrix) = dim(module.trait.correlation)
## plot
# color palette
ramp <- colorRamp2(seq(-2.5, 2.5, 0.1), hcl_palette = 'Blue-Red 3', reverse = FALSE)
colors <- attr(ramp, "colors")
# Display the correlation values within a heatmap plot
par(mar = c(5, 10, 3, 3))# good for R markdown
labeledHeatmap(Matrix = module.trait.correlation,
               xLabels = names(datTraits[-9,c(2:3,6:9)]),
               yLabels = names(net$MEs),
               ySymbols = names(net$MEs),
               colorLabels = FALSE,
               colors = colors,
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.7,
               cex.lab = 1,
               zlim = c(-1,1),
               main = paste("BioNERO Module-trait relationships"))

# Display the correlation values for the four group only
par(mar = c(5, 10, 3, 3))# good for R markdown
labeledHeatmap(Matrix = module.trait.correlation[,3:6],
               xLabels = c("HCP_NS","HCP_TNF","SCZ_NS","SCZ_TNF"),
               yLabels = names(net$MEs),
               ySymbols = names(net$MEs),
               colorLabels = FALSE,
               colors = diverging_hcl(50, palette = "Blue-Red 3"),
               textMatrix = textMatrix[,3:6],
               setStdMargins = FALSE,
               cex.text = 0.7,
               cex.lab = 1,
               zlim = c(-1,1),
               main = paste("BioNERO Module-trait relationships"))

extract genes in modules

# read in annotation file
annotations.df <- read.csv(file="anno.csv", header=TRUE, sep=",", row.names = 1)
## hubgenes
## identify top hub gene
hubgene <- chooseTopHubInEachModule(
  t(final_exp), 
  net$genes_and_modules["Modules"], 
  omitColors = "grey", 
  power = 11, 
  type = "signed hybrid")
# get hubgene symbol
hubgene_symbol <- annotations.df[annotations.df$ensembl %in% hubgene, c("ensembl","Gene_name")]
#write.csv(hubgene_symbol, "bioNERO_hubgene.csv", row.names = FALSE, quote = FALSE)

## identify multiple hub genes in a module
topHubs <- function (datExpr, colorh, omitColors = "grey", power = 2, type = "signed hybrid", ...) 
{
    # modified from chooseTopHubInEachModule, but return the table of all genes connectivity
    isIndex = FALSE
    modules = names(table(colorh))
    if (!is.na(omitColors)[1]) 
        modules = modules[!is.element(modules, omitColors)]
    if (is.null(colnames(datExpr))) {
        colnames(datExpr) = 1:dim(datExpr)[2]
        isIndex = TRUE
    }
    connectivity_table <- data.frame(matrix(ncol = 3)) %>% setNames(c('gene', 'connectivity_rowSums_adj', 'module'))
    hubs = rep(NA, length(modules))
    names(hubs) = modules
    for (m in modules) {
        adj = adjacency(datExpr[, colorh == m], power = power, type = type, ...)
        hub = which.max(rowSums(adj))
        hubs[m] = colnames(adj)[hub]
        sorted_genes <- rowSums(adj) %>% sort(decreasing = T) %>% as.data.frame()  %>%  
                tibble::rownames_to_column() %>% setNames(c('gene', 'connectivity_rowSums_adj')) %>% mutate(module = m)
        connectivity_table <- connectivity_table %>% rbind(sorted_genes)
    }
    if (isIndex) {
        hubs = as.numeric(hubs)
        names(hubs) = modules
    }
    return(connectivity_table %>% na.omit)
}
# get top 10 hub genes per module
hubgenes <- topHubs(t(final_exp), net$genes_and_modules["Modules"], omitColors = "grey", power = 11, type = "signed hybrid") %>% group_by(module) %>% top_n( 10, wt = connectivity_rowSums_adj)
# get the gene symbol of the hub genes
hubgenes_list <- annotations.df[annotations.df$ensembl %in% hubgenes$gene, c("ensembl","Gene_name")]
hubgenes_list <- merge(hubgenes_list, hubgenes, by.x = "ensembl", by.y = "gene", all.x = TRUE)
#write.csv(hubgenes_list, "bioNERO_hubgene_list.csv", row.names = FALSE, quote = FALSE)

## get gene list
# a list of color
color_ME <- colnames(net$MEs)
color <- sub("^ME", "", color_ME)
# function
extract_genes_by_color <- function(color, net, annotations.df) {
  color_genes <- net$genes_and_modules %>%
    filter(Modules == color) %>%
    dplyr::select(Genes)
  
  color_genes_info <- annotations.df %>% 
    filter(ensembl %in% color_genes$Genes) %>%
    dplyr::select(ensembl, Gene_name)
  
  write.csv(color_genes_info, paste0(color, "_genes.csv"), row.names = FALSE, quote = FALSE)

}
# run color
for (i in color){
  extract_genes_by_color(i, net, annotations.df)
}
#extract_genes_by_color("mediumpurple1", net, annotations.df)

GO term enrichment

library(clusterProfiler)
## Registered S3 method overwritten by 'ggtree':
##   method         from     
##   fortify.igraph ggnetwork
## clusterProfiler v4.10.1  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:biomaRt':
## 
##     select
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following object is masked from 'package:stats':
## 
##     filter
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:clusterProfiler':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
## 
library(AnnotationDbi)
# get module genes
setwd("/Users/cl2375/OneDrive - Yale University/PhD study/Labs/Brennand Lab/RNA-seq/RNAseq_Pablo/WGCNA/Pablo_count/ModuleCal_v3/bioNERO_ModuleGene_v2")
GO_list <- read.csv(file = "brown_genes.csv", head = TRUE)
# GO term enrichment
GO_results <- enrichGO(gene = GO_list[,1], OrgDb = "org.Hs.eg.db", keyType = "ENSEMBL", ont = "BP", pvalueCutoff = 0.05, 
                       pAdjustMethod = "BH", rownames(exp_filt))
# reduce GO term redundancy
GO_results_sim <- simplify(GO_results)
# plot out top GO
plot(barplot(GO_results, showCategory = 20))

# plot out reduced redundancy top GO
plot(barplot(GO_results_sim, showCategory = 20))

No significant GO terms found for antiquewhite1_genes No significant GO terms found for bisque4_genes No significant GO terms found for blue_genes No significant GO terms found for blue3_genes No significant GO terms found for coral1_genes No significant GO terms found for coral4_genes No significant GO terms found for darkorange2_genes No significant GO terms found for darkred_genes No significant GO terms found for darkseagreen2_genes No significant GO terms found for darkslateblue_genes No significant GO terms found for green_genes No significant GO terms found for green4_genes No significant GO terms found for honeydew_genes No significant GO terms found for honeydew1_genes No significant GO terms found for indianred4_genes No significant GO terms found for lightpink2_genes No significant GO terms found for lightsteelblue1_genes No significant GO terms found for magenta_genes No significant GO terms found for navajowhite1_genes No significant GO terms found for orangered4_genes No significant GO terms found for palevioletred_genes No significant GO terms found for palevioletred1_genes No significant GO terms found for tan4_genes No significant GO terms found for tomato_genes